---
title: "Replication Material for: How technological change affects regional electorates. Part 3: Analysis of SOEP data for Figure 8"
output: html_document
author: Nikolas Schöll and Thomas Kurer
editor_options: 
  chunk_output_type: inline
---

Note that to recreate the data set from scratch, access to SOEP data is required.

# Prepare data

Load packages:

```{r 3_1}
knitr::opts_chunk$set(
    warning = FALSE,
    message = FALSE,
    collapse = TRUE
)

options(scipen=999)

pacman::p_load(tidyverse,
               readstata13,
               fastDummies,
               magrittr)

parties = c("greens",  "linke","spd","fdp",  "cdu_csu", "authoritarian_right")

party_labels = c("Grünen", "Linke",  "SPD", "FDP", "CDU/CSU" , "Authoritarian Right")
party_colors = c("#86197A", "#58AB27", "#E30013", "#E3C503","#231F20","#00ADEF", "white")
```

## 1. Occupation, sector and education variables

From SOEP pgen.dta

```{r 3_2}
df_soep_pgen = 
  haven::read_dta("../data/SOEP/SOEP_CORE_v35_stata/pgen.dta") %>% 
  select(pid,
         year = syear,
         nace2 = pgnace2,
         nace = pgnace,
         edu_level = pgpsbil,
         pgkldb92, 
         pgkldb2010
         ) %>% 
  filter(year >= 1994) %>% 
    mutate(nace = ifelse(nace > 0 , nace, NA),
         nace2 = ifelse(nace2 > 0, nace2, NA),
         sector = case_when(nace2 %in% 10:35 ~ "Manufacturing",
                            nace2  > 0 ~ "Non-manufacturing",
                            nace %in% 15:37 ~ "Manufacturing",
                            nace  > 0 ~ "Non-manufacturing"
                            ) %>% 
           factor(levels = c("Non-manufacturing", "Manufacturing")),
         edu = case_when(edu_level %in% c(1,6,8) ~ "low",
                         edu_level == 2 ~ "middle",
                         edu_level %in% 3:4 ~ "high") %>% 
           factor(levels = c("high", "middle", "low"),                  
                  labels = c("High", "Middle", "Low")),
         year_bin = case_when(year %in% 1994:1998 ~ "1994-1998",
                              year %in% 1999:2003 ~ "1999-2003",
                              year %in% 2004:2008 ~ "2004-2008",
                              year %in% 2009:2013 ~ "2009-2013",
                              year %in% 2014:2018 ~ "2014-2018") %>% 
           factor(levels = c("1994-1998", "1999-2003", "2004-2008", "2009-2013", "2014-2018"),
                  labels = c("1994-\n1998", "1999-\n2003", "2004-\n2008", "2009-\n2013", "2014-\n2018"))) %>% 
  select(-edu_level)
```

### From occupation to main task

Umsteigeschluessel-KldB1992-4-Steller-KldB2010-5-Steller.xls: Crosswalk between different vintages of occupational classifications: Klassifikation der Berufe (KLDB) from German employment agency (Bundesagentur für Arbeit) and IAB.

tasks_kldb2010_3.dta: Task classification by occupation from IAB. Source: Dengler, Katharina; Matthes, Britta; Paulus, Wiebke (2014): Occupational Tasks in the German Labour Market \* an alternative measurement on the basis of an expert database. (FDZ-Methodenreport, 12/2014 (en)), Nürnberg, 36 S. <https://fdz.iab.de/187/section.aspx/Publikation/k141204303>

```{r 3_3}
cw_kldb = readxl::read_excel("../data/IAB/Umsteigeschluessel-KldB1992-4-Steller-KldB2010-5-Steller.xls", sheet = 2, skip = 4) %>% 
  transmute(pgkldb92 = `KldB 1992\n(4-Steller)` %>% as.numeric(),
         cw_kldb2010 = `KldB 2010 \n(5-Steller)` %>% as.numeric() %>% `/` (100) %>% round())

cw_kldb2010_task = haven::read_dta("../data/IAB/tasks_kldb2010_3.dta") %>% 
  select(kldb_2010 = kldb2010_3, year = jahr, main_task_code = haupttask) %>% 
  filter(year == 2011) %>% #exist for years between 2011 and 2013
  select(-year) %>% 
  mutate(kldb_2010 = kldb_2010 %>% as.numeric())

# merge on the PGEN data set
df_soep_pgen %<>% left_join(cw_kldb, by = "pgkldb92") %>% 
  mutate(pgkldb2010 = (pgkldb2010 / 100) %>% round(),
         kldb_2010 = ifelse(pgkldb2010 > 0, 
                            pgkldb2010,
                            cw_kldb2010)) %>% 
  left_join(cw_kldb2010_task, by = "kldb_2010") %>% 
  mutate(main_task = case_when(main_task_code %in% 1:2 ~ "Cognitive non-routine",
                               main_task_code == 3 ~ "Cognitive routine",
                               main_task_code == 4 ~ "Manual routine",
                               main_task_code == 5 ~ "Manual non-routine"
                               ) %>% 
           factor(levels= c("Cognitive non-routine", "Cognitive routine", "Manual routine", "Manual non-routine"))) %>% 
  ungroup() %>% 
  sample() %>% 
  distinct(pid, year, .keep_all = TRUE) %>%  # choose random of multiple crosswalks
  arrange(pid,year)

rm(cw_kldb, cw_kldb2010_task)
```

## 2. Party support

From pl.dta

```{r 3_4}
df_soep_pl = haven::read_dta("../data/SOEP/pl_only_necessary_variables.dta") %>% 
#df_soep_pl = haven::read_dta("../data/SOEP/SOEP_CORE_v35_stata/pl.dta") %>% 
  select(pid, 
         hid, 
         year = syear,
         party_code = plh0012_h,
         birth_year = ple0010_h,
         gender = pla0009_v2) %>% 
  mutate(age = year - birth_year,
         female = case_when(gender == 1 ~ 0,
                            gender == 2 ~ 1),
         party = case_when(party_code == 1 ~ "spd",
                  party_code %in% c(2,3,13) ~ "cdu_csu",
                  party_code == 4 ~ "fdp",
                  party_code == 5 ~ "greens",
                  party_code == 6 ~ "linke",
                  party_code %in% c(7,27) ~ "authoritarian_right") %>% 
           factor(levels = parties,
                  labels = party_labels)) %>%
  select(pid, hid, year, age, female, party)

```

## 3. State information

From hbrutto.dta

This is used to filter only to West Germany later on.

```{r 3_5, message=FALSE, warning=FALSE}
df_soep_hbrutto = haven::read_dta("../data/SOEP/SOEP_CORE_v35_stata/hbrutto.dta") %>%  
  select(year = syear,
         hid,
         state = bula)
```

## 4. Merge to one data set

```{r 3_6}
df = 
  df_soep_pl %>% 
  left_join(df_soep_pgen, by = c("pid", "year")) %>%  # merge on labor market data
  left_join(df_soep_hbrutto, by = c("hid", "year")) %>% #merge on state info
  filter(age %in% 18:65, #restrict to working age population
         state %in% 1:10) # only West Germany

rm(df_soep_pgen, df_soep_hbrutto, df_soep_pl)
```

## 5. Voting controlled for age

```{r 3_7}

df %<>% 
  mutate(greens = ifelse(party=="Grünen",1,0),
         linke = ifelse(party=="Linke",1,0),
         spd = ifelse(party=="SPD",1,0),         
         fdp = ifelse(party=="FDP",1,0),
         cdu_csu = ifelse(party=="CDU/CSU",1,0),
         authoritarian_right = ifelse(party=="Authoritarian Right",1,0))


# estimate coefficient for effect of age on party support
age_coefficients = list()
for (party_i in parties){
  age_coefficients[[party_i]] = 
    df %>% 
    mutate(party_j = get(party_i)) %>% 
    lm(formula = party_j ~ age) %>% 
    .$coefficients %>% 
    .[2]

}

# calculate party support controlling for age
df %<>% 
  mutate(greens = greens - age_coefficients[["greens"]] * (age - mean(age)),
         linke = linke - age_coefficients[["linke"]] * (age - mean(age)),
         spd = spd - age_coefficients[["spd"]] * (age - mean(age)),
         fdp = fdp - age_coefficients[["fdp"]] * (age - mean(age)),
         cdu_csu = cdu_csu - age_coefficients[["cdu_csu"]] * (age - mean(age)),
         authoritarian_right = authoritarian_right - age_coefficients[["authoritarian_right"]] * (age - mean(age))
         )
```

## Select relevant variables and store

```{r}
df %<>% 
  select(-pid, -hid, -age)

df %>% write_rds("../data/sample_SOEP.RDS")
```

# \### ANALYSIS

## Load data, packages and party names

```{r}
options(scipen=999)

pacman::p_load(tidyverse,
               readstata13,
               fastDummies,
               magrittr)

#load data 
df = read_rds("../data/sample_SOEP.RDS")

parties = c("greens",  "linke","spd","fdp",  "cdu_csu", "authoritarian_right")

party_labels = c("Grünen", "Linke",  "SPD", "FDP", "CDU/CSU" , "Authoritarian Right")
party_colors = c("#86197A", "#58AB27", "#E30013", "#E3C503","#231F20","#00ADEF", "white")
```

# Figure 8: Party support of diﬀerent segments of the workforce over time

## (a) Sector

We differentiate between manufacturing and non-manufacturing based on nace codes (var: p_nace or if not available: p_nace2). Codes 15-37 refer to manufacturing (Nace1) respectively codes 10-35 (Nace2).

```{r 3_8}


#How many observations?
df %>% 
  ggplot(aes(sector)) + 
  geom_bar()

# Evolution over time bins
df %>% 
  filter(!is.na(party),
         !is.na(sector)) %>%
  group_by(year_bin, sector) %>% 
  summarise(across(.cols = parties, ~sum(.), .names = "votes_{col}")) %>% 
  pivot_longer(cols = votes_greens:votes_authoritarian_right, 
               names_prefix = "votes_", 
               names_to = "party", 
               values_to = "votes") %>% 
  mutate(party = party %>% 
           factor(levels = parties,
                  labels = party_labels)) %>% 
  group_by(year_bin, sector) %>% 
  mutate(vote_share = votes / sum(votes),
         CI = 1.96*sqrt(vote_share*(1-vote_share)/sum(votes))) %>% 
  ggplot(aes(x = year_bin,
             y = vote_share,
             color = sector,
             group = sector,
             linetype = sector)) +
  geom_path(aes(), size = 1) +
  geom_errorbar(aes(ymin=vote_share-CI, ymax=vote_share+CI), width=.2) +
  facet_wrap(~ party, scales = "free_y") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = NULL,
       y = "Party support by workers' industry sector",
       color = NULL,
       linetype = NULL)

ggsave("../results/figures/Figure_8a.pdf", width = 18, height = 12, units = "cm")
```

## (b) Main Task

```{r 3_9}
df %>% 
  ggplot(aes(main_task)) + 
  geom_bar()

# Evolution over time bins
df %>% 
  filter(!is.na(party),
         !is.na(main_task)) %>%
  group_by(year_bin, main_task) %>% 
  summarise(across(.cols = parties, ~sum(.), .names = "votes_{col}")) %>% 
  pivot_longer(cols = votes_greens:votes_authoritarian_right, 
               names_prefix = "votes_", 
               names_to = "party", 
               values_to = "votes") %>% 
  mutate(party = party %>% 
           factor(levels = parties,
                  labels = party_labels)) %>% 
  group_by(year_bin, main_task) %>% 
  mutate(vote_share = votes / sum(votes),
         CI = 1.96*sqrt(vote_share*(1-vote_share)/sum(votes))) %>% 
  ggplot(aes(x = year_bin,
             y = vote_share,
             color = main_task,
             group = main_task,
             linetype = main_task)) +
  geom_path(aes(), size = 1) +
  geom_errorbar(aes(ymin=vote_share-CI, ymax=vote_share+CI), width=.2) +
  facet_wrap(~ party, scales = "free_y") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = NULL,
       y = "Party support by workers' main task",
       color = NULL,
       linetype = NULL)


ggsave("../results/figures/Figure_8b.pdf", width = 18, height = 12, units = "cm")
```

## (c) Education

Variable used: pgpsbil Differentiates between 1. kein Abschluss / Hauptschule 2. Realschule 3. Fachhochschulreife, Abitur

```{r 3_10}
#How many observations?
df %>% 
  ggplot(aes(edu)) + 
  geom_bar()

# Evolution over time bins
df %>% 
  filter(!is.na(party),
         !is.na(edu)) %>%
  group_by(year_bin, edu) %>% 
  summarise(across(.cols = parties, ~sum(.), .names = "votes_{col}")) %>% 
  pivot_longer(cols = votes_greens:votes_authoritarian_right, 
               names_prefix = "votes_", 
               names_to = "party", 
               values_to = "votes") %>% 
  mutate(party = party %>% 
           factor(levels = parties,
                  labels = party_labels)) %>% 
  group_by(year_bin, edu) %>% 
  mutate(vote_share = votes / sum(votes),
         CI = 1.96*sqrt(vote_share*(1-vote_share)/sum(votes))) %>% 
  ggplot(aes(x = year_bin,
             y = vote_share,
             color = edu,
             group = edu,
             linetype = edu)) +
  geom_path(aes(), size = 1) +
  geom_errorbar(aes(ymin=vote_share-CI, ymax=vote_share+CI), width=.2) +
  facet_wrap(~ party, scales = "free_y") + 
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme(legend.position = "bottom") +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = NULL,
       y = "Party support by workers' education level",
       color = NULL,
       linetype = NULL)

ggsave("../results/figures/Figure_8c.pdf", width = 18, height = 12, units = "cm")
```

```{r}
print("Finished!")
```
